In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.feature_selection as fs
In [3]:
df=pd.read_csv(r'train.csv')
df.shape
Out[3]:
(600000, 15)
In [7]:
df.dtypes,df.describe()
Out[7]:
(ID                                        int64
 age                                     float64
 sex                                       int64
 chest                                   float64
 resting_blood_pressure                  float64
 serum_cholestoral                       float64
 fasting_blood_sugar                       int64
 resting_electrocardiographic_results      int64
 maximum_heart_rate_achieved             float64
 exercise_induced_angina                   int64
 oldpeak                                 float64
 slope                                     int64
 number_of_major_vessels                   int64
 thal                                      int64
 class                                     int64
 dtype: object,
                   ID            age            sex          chest  \
 count  600000.000000  600000.000000  600000.000000  600000.000000   
 mean   299999.500000      54.426085       0.677447       3.170247   
 std    173205.225094       9.086041       0.467454       0.949618   
 min         0.000000      26.061695       0.000000      -0.538498   
 25%    149999.750000      48.078493       0.000000       3.000000   
 50%    299999.500000      55.133425       1.000000       3.000000   
 75%    449999.250000      60.663775       1.000000       4.000000   
 max    599999.000000      79.591647       1.000000       4.000000   
 
        resting_blood_pressure  serum_cholestoral  fasting_blood_sugar  \
 count           600000.000000      600000.000000        600000.000000   
 mean               131.340485         249.592420             0.150757   
 std                 17.842450          51.699976             0.357812   
 min                 82.918527          98.326263             0.000000   
 25%                119.924094         216.475964             0.000000   
 50%                129.727941         244.233699             0.000000   
 75%                139.915216         274.345453             0.000000   
 max                209.673650         527.755764             1.000000   
 
        resting_electrocardiographic_results  maximum_heart_rate_achieved  \
 count                         600000.000000                600000.000000   
 mean                               1.020148                   149.592069   
 std                                0.994204                    23.072593   
 min                                0.000000                    61.844168   
 25%                                0.000000                   135.704237   
 50%                                2.000000                   153.224828   
 75%                                2.000000                   165.165497   
 max                                2.000000                   208.735196   
 
        exercise_induced_angina        oldpeak          slope  \
 count            600000.000000  600000.000000  600000.000000   
 mean                  0.333502       1.051391       1.596033   
 std                   0.471464       1.144288       0.629821   
 min                   0.000000      -0.806788       1.000000   
 25%                   0.000000       0.000000       1.000000   
 50%                   0.000000       0.811101       2.000000   
 75%                   1.000000       1.674081       2.000000   
 max                   1.000000       6.803372       3.000000   
 
        number_of_major_vessels           thal          class  
 count            600000.000000  600000.000000  600000.000000  
 mean                  0.681303       4.711378       0.444185  
 std                   0.950669       1.934766       0.496875  
 min                   0.000000       3.000000       0.000000  
 25%                   0.000000       3.000000       0.000000  
 50%                   0.000000       3.000000       0.000000  
 75%                   1.000000       7.000000       1.000000  
 max                   3.000000       7.000000       1.000000  )
In [8]:
df.isnull().sum()
Out[8]:
ID                                      0
age                                     0
sex                                     0
chest                                   0
resting_blood_pressure                  0
serum_cholestoral                       0
fasting_blood_sugar                     0
resting_electrocardiographic_results    0
maximum_heart_rate_achieved             0
exercise_induced_angina                 0
oldpeak                                 0
slope                                   0
number_of_major_vessels                 0
thal                                    0
class                                   0
dtype: int64
In [10]:
df['class'].value_counts().plot(kind='bar')
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x1815d133710>
In [11]:
df.corr()
Out[11]:
ID age sex chest resting_blood_pressure serum_cholestoral fasting_blood_sugar resting_electrocardiographic_results maximum_heart_rate_achieved exercise_induced_angina oldpeak slope number_of_major_vessels thal class
ID 1.000000 0.000785 -0.002925 -0.000878 0.000389 -0.001348 0.001213 -0.000416 0.000314 -0.000950 -0.000411 0.000105 -0.001402 -0.001594 -0.000345
age 0.000785 1.000000 0.064908 0.093444 0.025569 0.027536 -0.002035 0.040188 -0.305867 0.090434 0.086414 0.071041 0.098336 0.115100 0.222008
sex -0.002925 0.064908 1.000000 0.124861 0.031819 0.034927 -0.005528 0.052360 -0.108324 0.120282 0.116016 0.093245 0.131021 0.379811 0.292998
chest -0.000878 0.093444 0.124861 1.000000 0.048882 0.047981 -0.005523 0.075478 -0.155118 0.350996 0.164566 0.132378 0.187130 0.218679 0.419546
resting_blood_pressure 0.000389 0.025569 0.031819 0.048882 1.000000 0.013756 -0.000887 0.018497 -0.045325 0.046772 0.045875 0.038072 0.052041 0.058589 0.115044
serum_cholestoral -0.001348 0.027536 0.034927 0.047981 0.013756 1.000000 -0.001501 0.023376 -0.043411 0.046789 0.045647 0.038805 0.052265 0.059556 0.117765
fasting_blood_sugar 0.001213 -0.002035 -0.005528 -0.005523 -0.000887 -0.001501 1.000000 -0.002209 0.006499 -0.009223 -0.007197 -0.004878 -0.007578 -0.008597 -0.015911
resting_electrocardiographic_results -0.000416 0.040188 0.052360 0.075478 0.018497 0.023376 -0.002209 1.000000 -0.066058 0.072797 0.069246 0.054973 0.081892 0.094217 0.180097
maximum_heart_rate_achieved 0.000314 -0.305867 -0.108324 -0.155118 -0.045325 -0.043411 0.006499 -0.066058 1.000000 -0.152294 -0.145626 -0.117064 -0.165890 -0.189321 -0.368760
exercise_induced_angina -0.000950 0.090434 0.120282 0.350996 0.046772 0.046789 -0.009223 0.072797 -0.152294 1.000000 0.161057 0.129972 0.183326 0.211677 0.411014
oldpeak -0.000411 0.086414 0.116016 0.164566 0.045875 0.045647 -0.007197 0.069246 -0.145626 0.161057 1.000000 0.522338 0.174438 0.204169 0.392788
slope 0.000105 0.071041 0.093245 0.132378 0.038072 0.038805 -0.004878 0.054973 -0.117064 0.129972 0.522338 1.000000 0.140123 0.163710 0.316084
number_of_major_vessels -0.001402 0.098336 0.131021 0.187130 0.052041 0.052265 -0.007578 0.081892 -0.165890 0.183326 0.174438 0.140123 1.000000 0.232688 0.447143
thal -0.001594 0.115100 0.379811 0.218679 0.058589 0.059556 -0.008597 0.094217 -0.189321 0.211677 0.204169 0.163710 0.232688 1.000000 0.516555
class -0.000345 0.222008 0.292998 0.419546 0.115044 0.117765 -0.015911 0.180097 -0.368760 0.411014 0.392788 0.316084 0.447143 0.516555 1.000000
In [13]:
df.corr()['class'].plot(kind='bar')
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x1815d1af390>
In [14]:
df.nunique()
Out[14]:
ID                                      600000
age                                     594106
sex                                          2
chest                                   133009
resting_blood_pressure                  596241
serum_cholestoral                       598797
fasting_blood_sugar                          2
resting_electrocardiographic_results         3
maximum_heart_rate_achieved             597583
exercise_induced_angina                      2
oldpeak                                 384255
slope                                        3
number_of_major_vessels                      4
thal                                         3
class                                        2
dtype: int64
In [19]:
age_bins=pd.cut(df.age,bins=10)
In [21]:
df['age_bins']=age_bins.cat.codes
In [22]:
df.corr()['class'].plot(kind='bar')
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x1815d252470>
In [24]:
cont_df=df.select_dtypes('float64')
In [25]:
cont_df.plot()
Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x1815d7f4978>
In [58]:
for col in cont_df.columns:
    fig=plt.figure(figsize=(20,10))
    plt.scatter(cont_df[col].index,cont_df[col],c=df['class'])
    plt.title(col)
    plt.show()
In [61]:
plt.hist(one['chest'])
Out[61]:
(array([3.5000e+01, 4.0500e+02, 2.4290e+03, 6.8300e+03, 9.3110e+03,
        6.1260e+03, 1.9490e+03, 3.8971e+04, 1.5000e+01, 2.0044e+05]),
 array([-0.168948 ,  0.2479468,  0.6648416,  1.0817364,  1.4986312,
         1.915526 ,  2.3324208,  2.7493156,  3.1662104,  3.5831052,
         4.       ]),
 <a list of 10 Patch objects>)
In [65]:
one,zero=cont_df[df['class']==1].copy(),cont_df[df['class']==0].copy()
for col in cont_df.columns:
    fig=plt.figure(figsize=(20,10))
    plt.hist(one[col],alpha=0.5,color='red')
    plt.hist(zero[col],alpha=0.5,color='green')
    plt.title(col)
    plt.show()
In [4]:
cat_df=df.select_dtypes('int64')
In [36]:
cat_df.groupby('class').agg('count')
Out[36]:
ID sex fasting_blood_sugar resting_electrocardiographic_results exercise_induced_angina slope number_of_major_vessels thal
class
0 333489 333489 333489 333489 333489 333489 333489 333489
1 266511 266511 266511 266511 266511 266511 266511 266511
In [37]:
cat_df.columns
Out[37]:
Index(['ID', 'sex', 'fasting_blood_sugar',
       'resting_electrocardiographic_results', 'exercise_induced_angina',
       'slope', 'number_of_major_vessels', 'thal', 'class'],
      dtype='object')
In [43]:
cat_df.groupby(['class','sex','fasting_blood_sugar','resting_electrocardiographic_results','exercise_induced_angina','slope','number_of_major_vessels','thal']).agg('count')
Out[43]:
ID
class sex fasting_blood_sugar resting_electrocardiographic_results exercise_induced_angina slope number_of_major_vessels thal
0 0 0 0 0 1 0 3 29081
6 207
7 1107
1 3 4952
6 42
7 167
2 3 1762
6 18
7 83
3 3 841
6 6
7 35
2 0 3 13078
6 117
7 470
1 3 2200
6 14
7 102
2 3 863
6 5
7 33
3 3 364
7 15
3 0 3 2877
6 17
7 110
1 3 476
6 5
7 21
2 3 200
... ... ... ... ... ... ... ... ...
1 1 1 2 1 1 2 3 156
6 62
7 490
3 3 87
6 28
7 277
2 0 3 525
6 190
7 1484
1 3 482
6 158
7 1433
2 3 358
6 136
7 987
3 3 208
6 82
7 589
3 0 3 66
6 29
7 237
1 3 72
6 25
7 204
2 3 66
6 15
7 143
3 3 30
6 14
7 102

1471 rows × 1 columns

In [52]:
cat_df.groupby(['class','sex','fasting_blood_sugar','resting_electrocardiographic_results','exercise_induced_angina','slope','number_of_major_vessels','thal']).agg('count').plot(kind='bar')
plt.xticks(rotation='vertical')
plt.show()
In [49]:
cat_df.groupby(['class','sex','fasting_blood_sugar','resting_electrocardiographic_results']).agg('count').plot(kind='bar')
plt.xticks(rotation='vertical')
plt.show()
In [8]:
cat_df.columns[1:-1]
Out[8]:
Index(['sex', 'fasting_blood_sugar', 'resting_electrocardiographic_results',
       'exercise_induced_angina', 'slope', 'number_of_major_vessels', 'thal'],
      dtype='object')
In [10]:
ss=cat_df.groupby(['class','slope'])['ID'].count()
In [9]:
for col in cat_df.columns[1:-1]:
    ss=cat_df.groupby(['class',col])['ID'].count()
    fig, ax = plt.subplots()    
    ax.barh(list(range(ss.shape[0])),ss.values)
    for i, v in enumerate(ss):
        #ax.barh(ss.index,ss.values)

        ax.text(v + 3, i + .25, str(v), color='blue', fontweight='bold')
    plt.ylabel(ss.index.values)
    plt.title(col)
    plt.show()
In [29]:
import seaborn as sns
In [9]:
sns.countplot(cat_df['slope'])
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x26f4246bcc0>

Feature Engineering with Model building

In [13]:
import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.model_selection import train_test_split
In [12]:
clf=xgb.XGBClassifier()
In [19]:
df.iloc[:,1:-1].columns,df.iloc[:,-1].name
Out[19]:
(Index(['age', 'sex', 'chest', 'resting_blood_pressure', 'serum_cholestoral',
        'fasting_blood_sugar', 'resting_electrocardiographic_results',
        'maximum_heart_rate_achieved', 'exercise_induced_angina', 'oldpeak',
        'slope', 'number_of_major_vessels', 'thal'],
       dtype='object'), 'class')
In [21]:
X_train, X_test, y_train, y_test=train_test_split(df.iloc[:,1:-1],df.iloc[:,-1])
In [22]:
clf.fit(X_train,y_train)
Out[22]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
In [23]:
pred=clf.predict(X_test)
In [30]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,pred)
sns.heatmap(cm)
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x2e0a710c780>
In [31]:
plot_importance(clf)
Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x2e0a72f3588>
In [32]:
plot_tree(clf)
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
F:\anaconda\lib\site-packages\graphviz-0.8.4-py3.6.egg\graphviz\backend.py in pipe(engine, format, data, quiet)
    158             stdout=subprocess.PIPE, stderr=subprocess.PIPE,
--> 159             **POPEN_KWARGS)
    160     except OSError as e:

F:\anaconda\lib\subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors)
    708                                 errread, errwrite,
--> 709                                 restore_signals, start_new_session)
    710         except:

F:\anaconda\lib\subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_start_new_session)
    996                                          os.fspath(cwd) if cwd is not None else None,
--> 997                                          startupinfo)
    998             finally:

FileNotFoundError: [WinError 2] The system cannot find the file specified

During handling of the above exception, another exception occurred:

ExecutableNotFound                        Traceback (most recent call last)
<ipython-input-32-5a48441beb57> in <module>()
----> 1 plot_tree(clf)

F:\anaconda\lib\site-packages\xgboost\plotting.py in plot_tree(booster, fmap, num_trees, rankdir, ax, **kwargs)
    259 
    260     s = BytesIO()
--> 261     s.write(g.pipe(format='png'))
    262     s.seek(0)
    263     img = image.imread(s)

F:\anaconda\lib\site-packages\graphviz-0.8.4-py3.6.egg\graphviz\files.py in pipe(self, format)
    123         data = text_type(self.source).encode(self._encoding)
    124 
--> 125         outs = backend.pipe(self._engine, format, data)
    126 
    127         return outs

F:\anaconda\lib\site-packages\graphviz-0.8.4-py3.6.egg\graphviz\backend.py in pipe(engine, format, data, quiet)
    160     except OSError as e:
    161         if e.errno == errno.ENOENT:
--> 162             raise ExecutableNotFound(args)
    163         else:  # pragma: no cover
    164             raise

ExecutableNotFound: failed to execute ['dot', '-Tpng'], make sure the Graphviz executables are on your systems' PATH